Read data ##################

library(NHANES)
library(dplyr)
# Keep the raw data around, then restrict to working-age adults (18-59)
# and drop exact duplicate rows (unique.data.frame() is implemented as
# df[!duplicated(df), , drop = FALSE], so the result is identical).
df0 <- NHANES
df <- NHANES[NHANES$Age >= 18 & NHANES$Age < 60, ]
df <- unique(df)
  1. Data cleaning ########################################

Create a subset with only the variables of interest: SleepHrsNight, BMI, Age, Gender, Race1, TotChol, BPDiaAve, BPSysAve, AlcoholYear, Poverty, DaysMentHlthBad, UrineFlow1, PhysActive, DaysPhysHlthBad, Smoke100, HealthGen

# Keep only the analysis variables.
vars_keep <- c(
  "SleepHrsNight", "BMI", "Age", "Gender", "Race1", "TotChol",
  "BPDiaAve", "BPSysAve", "AlcoholYear", "Poverty", "DaysMentHlthBad",
  "UrineFlow1", "PhysActive", "DaysPhysHlthBad", "Smoke100", "HealthGen"
)
data2 <- df %>% select(all_of(vars_keep))

"Gender", "Smoke100", and "PhysActive" are binary variables, so set them as factors

data type

# Recode binary variables to 0/1 (ifelse() propagates NA, so missing
# values stay missing).
data2$Gender <- ifelse(data2$Gender == "male", 0, 1)
data2$Smoke100 <- ifelse(data2$Smoke100 == "No", 0, 1)
data2$PhysActive <- ifelse(data2$PhysActive == "No", 0, 1)
# Integer codes for the multi-level factors.
# BUG FIX: case_when() requires type-consistent RHS values; mixing the
# double literals 1..5 with the NA_integer_ default errors in older
# dplyr and silently promotes in newer versions. Integer literals
# (1L..5L) keep the result a clean integer vector either way.
data2 <- data2 %>%
  mutate(
    Race1 = case_when(
      Race1 == 'Black' ~ 1L,
      Race1 == 'Hispanic' ~ 2L,
      Race1 == 'Mexican' ~ 3L,
      Race1 == 'White' ~ 4L,
      Race1 == 'Other' ~ 5L,
      TRUE ~ NA_integer_  # default when no condition matches
    ),
    HealthGen = case_when(
      HealthGen == 'Poor' ~ 1L,
      HealthGen == 'Fair' ~ 2L,
      HealthGen == 'Good' ~ 3L,
      HealthGen == 'Vgood' ~ 4L,
      HealthGen == 'Excellent' ~ 5L,
      TRUE ~ NA_integer_  # default when no condition matches
    )
  )
# Convert the recoded variables to factors for tabulation/modelling.
data2$PhysActive <- as.factor(data2$PhysActive)
data2$Smoke100 <- as.factor(data2$Smoke100)
data2$HealthGen <- as.factor(data2$HealthGen)
data2$Race1 <- as.factor(data2$Race1)
data2$Gender <- as.factor(data2$Gender)

Create an indicator variable “Incomplete” (0 if all variables are complete; 1 if at least one variable is missing)

# Flag rows with at least one missing value (Incomplete = 1).
# FIX: rowSums(is.na(.)) replaces apply(data2, 1, ...), which coerces
# the whole data frame to a character matrix row by row; rowSums is
# vectorized and type-safe, and yields the same per-row NA counts.
miss_num = rowSums(is.na(data2))
data2$Incomplete = ifelse(miss_num > 0, 1, 0)
data2$Incomplete = as.factor(data2$Incomplete)
  1. Compare complete to non-complete cases with graphs and t/χ2 tests ################

Create two categorical variables

  1. Create "Age_4Cat" with 3 levels (0/1/2), using cut points at 20, 40, 60
# Age groups at cut points 20 / 40 / 60 (labels 0/1/2).
# BUG FIX: data2 is restricted to Age < 60, so max(Age) < 60 and the
# original breaks c(20, 40, 60, max(Age)) were out of order; cut()
# sorts breaks silently, which produced unintended intervals and coded
# every age <= 20 as NA. Anchor the first break at the data minimum
# and include the lowest value instead.
data2$Age_4Cat = cut(data2$Age,
                     breaks = c(min(data2$Age, na.rm = TRUE), 20, 40, 60),
                     right = TRUE, include.lowest = TRUE, labels = c(0:2))
  1. Create "BMI_cat" with 3 levels (0/1/2), using cut points at 18, 25, 30
# BMI groups at cut points 18 / 25 / 30 (labels 0/1/2).
# BUG FIX: the original breaks c(20, 40, 60, max(BMI)) were copy-pasted
# from the age recode; the cut points stated for BMI are 18, 25, 30.
# include.lowest keeps the maximum BMI, which the right-open last
# interval would otherwise drop (right = FALSE).
data2$BMI_cat = cut(data2$BMI,
                    breaks = c(18, 25, 30, max(data2$BMI, na.rm = TRUE)),
                    right = FALSE, include.lowest = TRUE, labels = c(0:2))
  1. Create a new dataset with only complete data; Create another new dataset with only incomplete data ################
# Split the data by the completeness flag built above.
data_complete = data2[data2$Incomplete == 0, ]
data_incomplete = data2[data2$Incomplete == 1, ]
  1. null hypothesis: H0:p1=p2 Ha: H1:\(p1 \neq p2\); Decision: Do not reject H0
# Compare mean BMI between complete and incomplete cases with a
# normal-approximation z statistic.
# FIXES: group sizes are computed from the data rather than hard-coded
# (2710 / 1509), and the incomplete-case sum must drop NAs (otherwise
# p2 -- and everything downstream -- is NA).
# NOTE(review): BMI is continuous, not a proportion, so the
# p*(1-p)/n standard error below is not a valid SE for a mean; the
# two-sample t-tests in the next section are the appropriate
# comparison. The original formula is kept pending confirmation of
# intent.
n1f = sum(!is.na(data_complete$BMI))
p1 = sum(data_complete$BMI, na.rm = TRUE) / n1f
n2f = sum(!is.na(data_incomplete$BMI))
p2 = sum(data_incomplete$BMI, na.rm = TRUE) / n2f
f_diff = p1 - p2
se_f = sqrt(p1 * (1 - p1) / n1f + p2 * (1 - p2) / n2f)
z <- f_diff / se_f
# Two-sided p-value and 95% normal-theory CI for the difference.
p_value_f <- 2 * (1 - pnorm(abs(z)))
low_bound_diff_f <- f_diff - 1.96 * se_f
high_bound_diff_f <- f_diff + 1.96 * se_f
  1. Use t-tests (χ2 tests) to compare differences in the mean (or proportion) of a given variable comparing complete cases to incomplete cases

Continuous variables: get mean(sd) and p-values from two-sample t-tests

Prepare data

# Continuous variables summarised in Table 1.
Cont_vars = c("BMI", "SleepHrsNight", "Age", "TotChol", "BPDiaAve",
              "BPSysAve", "AlcoholYear", "DaysPhysHlthBad", "Poverty",
              "UrineFlow1", "DaysMentHlthBad")
# Pull those columns from each group and coerce every column to numeric.
Cont_complete = data_complete[Cont_vars]
Cont_incomplete = data_incomplete[Cont_vars]
Cont_complete_numeric <- as.data.frame(lapply(Cont_complete, as.numeric))
Cont_incomplete_numeric <- as.data.frame(lapply(Cont_incomplete, as.numeric))

Get mean and sd for complete cases

# Per-variable mean and SD among complete cases (no NAs by design).
Cont_complete_mean = vapply(Cont_complete_numeric, mean, numeric(1))
Cont_complete_sd = vapply(Cont_complete_numeric, sd, numeric(1))

Get mean and sd for incomplete cases

# Per-variable mean and SD among incomplete cases, ignoring NAs.
Cont_incomplete_mean = vapply(Cont_incomplete_numeric, mean, numeric(1), na.rm = TRUE)
Cont_incomplete_sd = vapply(Cont_incomplete_numeric, sd, numeric(1), na.rm = TRUE)

Count # of complete cases for each continuous variable separately in the complete-cases dataset and the incomplete-cases dataset

# Non-missing count per continuous variable within each group.
Cont_Number_complete = vapply(Cont_complete_numeric, function(x) sum(!is.na(x)), integer(1))
Cont_Number_incomplete = vapply(Cont_incomplete_numeric, function(x) sum(!is.na(x)), integer(1))

Determine whether the variance from complete cases and variance from incomplete cases are equal or not

# TRUE when the F test cannot reject equal variances (p >= 0.05); this
# drives the var.equal choice in the t-tests below.
Cont_F_P_value = vapply(seq_along(Cont_vars), function(j) {
  var.test(Cont_complete_numeric[[j]], Cont_incomplete_numeric[[j]],
           alternative = "two.sided")$p.value >= 0.05
}, logical(1))

Get p-values from two-sample t-tests

# Two-sample t-test per variable; pooled variance only when the F test
# above did not reject equality of variances.
Cont_P_value = vapply(seq_along(Cont_vars), function(j) {
  t.test(Cont_complete_numeric[[j]], Cont_incomplete_numeric[[j]],
         var.equal = Cont_F_P_value[j])$p.value
}, numeric(1))

Generate the summary table used to fill in Table1

# Assemble the Table 1 input: n, mean, SD per group plus t-test p-value.
Cont_summary = data.frame(Cont_Number_complete, Cont_complete_mean,
                          Cont_complete_sd, Cont_Number_incomplete,
                          Cont_incomplete_mean, Cont_incomplete_sd,
                          Cont_P_value)
format(Cont_summary, nsmall = 2) ## nsmall = 2 pads to at least two decimals

Categorical/binary variables: get proportion and p-values from χ2 tests

Binary variables

Prepare data

# Binary variables compared between complete and incomplete cases.
Bi_vars = c("Gender", "Smoke100", "PhysActive")
Bi_complete = data_complete[Bi_vars]
Bi_incomplete = data_incomplete[Bi_vars]
Bi_data = data2[c(Bi_vars, "Incomplete")]

Count # of complete cases for each binary variable seperately in all complete cases dataset and incomplete cases dataset

# Non-missing count per binary variable within each group.
Bi_Number_complete = vapply(Bi_complete, function(x) sum(!is.na(x)), integer(1))
Bi_Number_incomplete = vapply(Bi_incomplete, function(x) sum(!is.na(x)), integer(1))

Calculate the proportions of cases with level=1 for each binary variable seperately in all complete cases dataset and incomplete cases dataset

# Proportion coded 1 (the second factor level) within each group.
Bi_prop_complete = sapply(seq_along(Bi_vars), function(j) prop.table(table(Bi_complete[[j]]))[2])
Bi_prop_incomplete = sapply(seq_along(Bi_vars), function(j) prop.table(table(Bi_incomplete[[j]]))[2])

Get p-values from χ2 tests

# Pearson chi-squared test (no continuity correction) of each variable
# against the completeness indicator.
Bi_P_value = vapply(Bi_vars, function(v) {
  chisq.test(table(Bi_data[[v]], Bi_data$Incomplete), correct = FALSE)$p.value
}, numeric(1))

Generate the summary table used to fill in Table1

Bi_summary = data.frame(Bi_Number_complete, Bi_prop_complete,
                        Bi_Number_incomplete, Bi_prop_incomplete,
                        Bi_P_value)
format(Bi_summary, nsmall = 2) ## nsmall = 2 pads to at least two decimals

Categorical variables

Prepare data

# Multi-level categorical variables compared between groups.
Cat_vars = c("HealthGen", "Race1")
Cat_complete = subset(data_complete, select = Cat_vars)
Cat_incomplete = subset(data_incomplete, select = Cat_vars)
Cat_data = subset(data2, select = c(Cat_vars, "Incomplete"))

Count # of complete cases for each categorical variable seperately in all complete cases dataset and incomplete cases dataset

# Non-missing count per variable within each group.
Cat_Number_complete = apply(Cat_complete, 2, function(x) sum(complete.cases(x)))
Cat_Number_incomplete = apply(Cat_incomplete, 2, function(x) sum(complete.cases(x)))

Calculate the proportions of each level for each categorical variable seperately in all complete cases dataset and incomplete cases dataset

# Level proportions, one row per variable after t().
# NOTE(review): apply() coerces the data frame to a character matrix,
# so empty factor levels are dropped by table(); if some level were
# absent from one group the two matrices would have different widths
# and t(apply(...)) would misbehave -- confirm every level occurs in
# both groups.
Cat_prop_complete = t(apply(Cat_complete, 2, function(x) prop.table(table(x))))
Cat_prop_incomplete = t(apply(Cat_incomplete, 2, function(x) prop.table(table(x))))

Get p-values from χ2 tests

# Chi-squared test of each variable against the completeness flag.
Cat_P_value = sapply(1:length(Cat_vars), function(x) chisq.test(table(Cat_data[[Cat_vars[x]]], Cat_data$Incomplete), correct = F)$p.value)

Generate the summary table used to fill in Table1

# check.names = FALSE keeps the level names (1..5) as column headers.
Cat_summary = data.frame(cbind(Cat_Number_complete, Cat_prop_complete,
                               Cat_Number_incomplete,
                               Cat_prop_incomplete, Cat_P_value), check.names = F)
format(Cat_summary, nsmall = 2) ## only keep the first two decimals

Basic characteristics ########################################

# Start a fresh report section: clear the workspace and reload packages.
# NOTE(review): rm(list = ls()) inside a script is an anti-pattern --
# it wipes the user's entire session, not just this analysis; prefer
# separate scripts or self-contained knitr chunks.
rm(list = ls())
gc()
##           used (Mb) gc trigger  (Mb) max used  (Mb)
## Ncells  745375 39.9    2364357 126.3  2364357 126.3
## Vcells 1799700 13.8    8388608  64.0  6847622  52.3
set.seed(123)  # reproducibility for anything stochastic below
library(car)
library(olsrr)
library(ggplot2)
library(lmtest)
  1. Data cleaning ########################################

select variables

library(NHANES)
df0 <- NHANES
# Working-age adults only (18-59).
df <- NHANES[NHANES$Age >= 18 & NHANES$Age < 60, ]

# Fraction of missing values per column.
colSums(is.na(df)) / nrow(df)

# Keep columns with under 30% missingness.
df <- df[, colSums(is.na(df)) / nrow(df) < 0.3]

exclude duplication

# unique.data.frame() == df[!duplicated(df), ].
df <- unique(df)
names(df)
##  [1] "ID"              "SurveyYr"        "Gender"          "Age"            
##  [5] "AgeDecade"       "Race1"           "Education"       "MaritalStatus"  
##  [9] "HHIncome"        "HHIncomeMid"     "Poverty"         "HomeRooms"      
## [13] "HomeOwn"         "Work"            "Weight"          "Height"         
## [17] "BMI"             "BMI_WHO"         "Pulse"           "BPSysAve"       
## [21] "BPDiaAve"        "BPSys1"          "BPDia1"          "BPSys2"         
## [25] "BPDia2"          "BPSys3"          "BPDia3"          "DirectChol"     
## [29] "TotChol"         "UrineVol1"       "UrineFlow1"      "Diabetes"       
## [33] "HealthGen"       "DaysPhysHlthBad" "DaysMentHlthBad" "LittleInterest" 
## [37] "Depressed"       "SleepHrsNight"   "SleepTrouble"    "PhysActive"     
## [41] "Alcohol12PlusYr" "AlcoholYear"     "Smoke100"        "Smoke100n"      
## [45] "Marijuana"       "RegularMarij"    "HardDrugs"       "SexEver"        
## [49] "SexAge"          "SexNumPartnLife" "SexNumPartYear"  "SameSex"        
## [53] "SexOrientation"

# Peek at the systolic blood pressure column.
df$BPSysAve

library(dplyr)
# Keep the analysis variables, then drop rows with any missing value.
analysis_vars <- c(
  "SleepHrsNight", "BMI", "Age", "Gender", "Race1", "TotChol",
  "BPDiaAve", "BPSysAve", "AlcoholYear", "Poverty", "DaysMentHlthBad",
  "UrineFlow1", "PhysActive", "DaysPhysHlthBad", "Smoke100", "HealthGen"
)
df2 <- df %>% select(all_of(analysis_vars))
df3 <- na.omit(df2)
library(dplyr)
  1. Data categorize ########################################
library(gtsummary)
library(kableExtra)
# Three BMI categories (<18, 18-30, >30), ordered as a factor.
df3 <- df3 %>%
  mutate(
    BMIcat = factor(
      case_when(
        BMI < 18 ~ "<18",
        BMI >= 18 & BMI <= 30 ~ "18-30",
        BMI > 30 ~ ">30"
      ),
      levels = c("<18", "18-30", ">30")
    )
  )
  1. generate table ########################################
# Table 1: participant characteristics by BMI category.
# Continuous cells: mean (SD); categorical cells: n (%).
table_basic <- df3 %>%
  tbl_summary(
    by = BMIcat,
    statistic = list(all_continuous()  ~ "{mean} ({sd})",
                     all_categorical() ~ "{n}    ({p}%)"),
    digits = list(all_continuous()  ~ c(2, 2),
                  all_categorical() ~ c(0, 1)),
    missing = "no"
  ) %>%
  modify_header(
    label = "**Variable**",
    all_stat_cols() ~ "**{level}**<br>N = {n} ({style_percent(p, digits=1)}%)"
  ) %>%
  modify_caption("Participant characteristics, by BMI category") %>%
  bold_labels() %>%
  add_overall(col_label = "**All participants**<br>N = {N}") %>%
  add_p(
    test = list(
      all_continuous() ~ "kruskal.test",
      all_categorical() ~ "chisq.test"
    ),
    pvalue_fun = function(x) style_pvalue(x, digits = 3)
  ) %>%
  # BUG FIX: the footnote previously claimed ALL p-values came from the
  # Kruskal-Wallis test, but add_p() above uses chi-squared tests for
  # the categorical variables.
  modify_footnote(
    all_stat_cols() ~ "p-values from Kruskal-Wallis rank sum test (continuous variables) and Pearson's chi-squared test (categorical variables)"
  )
## Warning for variable 'Race1':
## simpleWarning in stats::chisq.test(x = structure(c(4L, 4L, 4L, 4L, 4L, 4L, 2L, : Chi-squared approximation may be incorrect
## Warning for variable 'HealthGen':
## simpleWarning in stats::chisq.test(x = structure(c(3L, 2L, 4L, 3L, 4L, 3L, 3L, : Chi-squared approximation may be incorrect

View the table

# print(table_basic)
  1. convert to latex ########################################

Assuming ‘table1’ is already created using gtsummary

Convert to a kableExtra object

# Render the gtsummary table through knitr::kable / kableExtra.
kable_table <- as_kable(table_basic)

Assuming ‘kable_table’ is already created using gtsummary

Convert to a kableExtra object

# Apply LaTeX styling for the manuscript.
latex_table <- kable_table %>%
  kable_styling(latex_options = c("striped", "hold_position"))

Save the LaTeX table to file

save_kable(latex_table, file = "table_for_overleaf.tex")

MLR & Diagnosis no log ########################################

# Start a fresh report section: clear the workspace and reload packages.
# NOTE(review): rm(list = ls()) inside a script is an anti-pattern --
# it wipes the user's entire session, not just this analysis; prefer
# separate scripts or self-contained knitr chunks.
rm(list = ls())
gc()
##           used (Mb) gc trigger  (Mb) max used  (Mb)
## Ncells 1133413 60.6    2364357 126.3  2364357 126.3
## Vcells 2446264 18.7    8388608  64.0  6847622  52.3
set.seed(123)  # reproducibility for anything stochastic below
library(car)
library(olsrr)
library(ggplot2)
library(lmtest)
  1. Data cleaning ########################################

select variables

library(NHANES)
df0 <- NHANES
# Working-age adults only (18-59).
df <- NHANES[NHANES$Age >= 18 & NHANES$Age < 60, ]

# Fraction of missing values per column.
colSums(is.na(df)) / nrow(df)

# Keep columns with under 30% missingness.
df <- df[, colSums(is.na(df)) / nrow(df) < 0.3]

exclude duplication

# unique.data.frame() == df[!duplicated(df), ].
df <- unique(df)
names(df)
##  [1] "ID"              "SurveyYr"        "Gender"          "Age"            
##  [5] "AgeDecade"       "Race1"           "Education"       "MaritalStatus"  
##  [9] "HHIncome"        "HHIncomeMid"     "Poverty"         "HomeRooms"      
## [13] "HomeOwn"         "Work"            "Weight"          "Height"         
## [17] "BMI"             "BMI_WHO"         "Pulse"           "BPSysAve"       
## [21] "BPDiaAve"        "BPSys1"          "BPDia1"          "BPSys2"         
## [25] "BPDia2"          "BPSys3"          "BPDia3"          "DirectChol"     
## [29] "TotChol"         "UrineVol1"       "UrineFlow1"      "Diabetes"       
## [33] "HealthGen"       "DaysPhysHlthBad" "DaysMentHlthBad" "LittleInterest" 
## [37] "Depressed"       "SleepHrsNight"   "SleepTrouble"    "PhysActive"     
## [41] "Alcohol12PlusYr" "AlcoholYear"     "Smoke100"        "Smoke100n"      
## [45] "Marijuana"       "RegularMarij"    "HardDrugs"       "SexEver"        
## [49] "SexAge"          "SexNumPartnLife" "SexNumPartYear"  "SameSex"        
## [53] "SexOrientation"

# Peek at the systolic blood pressure column.
df$BPSysAve

library(dplyr)
# Keep the analysis variables, then drop rows with any missing value.
analysis_vars <- c(
  "SleepHrsNight", "BMI", "Age", "Gender", "Race1", "TotChol",
  "BPDiaAve", "BPSysAve", "AlcoholYear", "Poverty", "DaysMentHlthBad",
  "UrineFlow1", "PhysActive", "DaysPhysHlthBad", "Smoke100", "HealthGen"
)
df2 <- df %>% select(all_of(analysis_vars))
df3 <- na.omit(df2)

# NOTE(review): the two lines below contradict the rest of this
# section. Converting SleepHrsNight to minutes and then DROPPING the
# column would break every statement that follows (hist() below and
# all later models use SleepHrsNight); the drop line also used
# typographic quotes (“SleepHrsNight”), which is a syntax error in R.
# Both are left disabled -- delete them once confirmed stray edits.
# df3$SleepHrsNight <- df3$SleepHrsNight * 60
# df3 <- df3[, -which(names(df3) %in% "SleepHrsNight")]

# Correlation between the two blood-pressure averages.
cor(df3$BPSysAve, df3$BPDiaAve)

psych::describe(df3)

psych::pairs.panels(df3)

hist(df3$SleepHrsNight)

# Missingness fractions in the pre-na.omit data.
colSums(is.na(df2)) / nrow(df2)

# Baseline model: regress SleepHrsNight on all other variables.
fit0 <-
  lm(SleepHrsNight ~ .,
     data = df3)

data type

# Recode binary variables to 0/1 (ifelse() propagates NA).
df3$Gender <- ifelse(df3$Gender == "male", 0, 1)
df3$Smoke100 <- ifelse(df3$Smoke100 == "No", 0, 1)
df3$PhysActive <- ifelse(df3$PhysActive == "No", 0, 1)
# Integer codes for the multi-level factors.
# BUG FIX: case_when() requires type-consistent RHS values; mixing the
# double literals 1..5 with the NA_integer_ default errors in older
# dplyr and silently promotes in newer versions. Integer literals
# (1L..5L) keep the result a clean integer vector either way.
df3 <- df3 %>%
  mutate(
    Race1 = case_when(
      Race1 == 'Black' ~ 1L,
      Race1 == 'Hispanic' ~ 2L,
      Race1 == 'Mexican' ~ 3L,
      Race1 == 'White' ~ 4L,
      Race1 == 'Other' ~ 5L,
      TRUE ~ NA_integer_  # default when no condition matches
    ),
    HealthGen = case_when(
      HealthGen == 'Poor' ~ 1L,
      HealthGen == 'Fair' ~ 2L,
      HealthGen == 'Good' ~ 3L,
      HealthGen == 'Vgood' ~ 4L,
      HealthGen == 'Excellent' ~ 5L,
      TRUE ~ NA_integer_  # default when no condition matches
    )
  )

model_3 add additional risk factors ##

# Model 3: BMI regressed on sleep plus demographic and clinical risk
# factors; Race1 and HealthGen enter as factors (dummy-coded).
m_3 <- lm(
  BMI ~ SleepHrsNight + Age + Gender + factor(Race1) + Poverty + TotChol +
    BPDiaAve + BPSysAve + AlcoholYear + Smoke100 + UrineFlow1 +
    DaysMentHlthBad + DaysPhysHlthBad + factor(HealthGen) + PhysActive,
  data = df3
)
summary(m_3)
## 
## Call:
## lm(formula = BMI ~ SleepHrsNight + Age + Gender + factor(Race1) + 
##     Poverty + TotChol + BPDiaAve + BPSysAve + AlcoholYear + Smoke100 + 
##     UrineFlow1 + DaysMentHlthBad + DaysPhysHlthBad + factor(HealthGen) + 
##     PhysActive, data = df3)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -16.617  -4.039  -0.661   3.256  35.707 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        27.324898   1.832007  14.915  < 2e-16 ***
## SleepHrsNight      -0.117162   0.103681  -1.130 0.258587    
## Age                 0.003943   0.013319   0.296 0.767244    
## Gender              0.407763   0.280402   1.454 0.146030    
## factor(Race1)2     -1.940769   0.619152  -3.135 0.001744 ** 
## factor(Race1)3     -1.315083   0.542230  -2.425 0.015374 *  
## factor(Race1)4     -1.510239   0.410939  -3.675 0.000243 ***
## factor(Race1)5     -3.041375   0.605088  -5.026 5.40e-07 ***
## Poverty             0.056116   0.088919   0.631 0.528047    
## TotChol             0.008566   0.131873   0.065 0.948215    
## BPDiaAve            0.061458   0.013402   4.586 4.77e-06 ***
## BPSysAve            0.046897   0.011244   4.171 3.15e-05 ***
## AlcoholYear        -0.008133   0.001470  -5.532 3.55e-08 ***
## Smoke100           -0.912564   0.280827  -3.250 0.001173 ** 
## UrineFlow1         -0.086054   0.138360  -0.622 0.534033    
## DaysMentHlthBad    -0.037214   0.017514  -2.125 0.033716 *  
## DaysPhysHlthBad     0.019369   0.020325   0.953 0.340703    
## factor(HealthGen)2 -2.921984   0.974038  -3.000 0.002731 ** 
## factor(HealthGen)3 -4.504202   0.965175  -4.667 3.24e-06 ***
## factor(HealthGen)4 -6.258420   0.990705  -6.317 3.21e-10 ***
## factor(HealthGen)5 -8.167162   1.044199  -7.821 8.00e-15 ***
## PhysActive         -0.778610   0.286658  -2.716 0.006655 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.21 on 2225 degrees of freedom
## Multiple R-squared:  0.1534, Adjusted R-squared:  0.1454 
## F-statistic:  19.2 on 21 and 2225 DF,  p-value: < 2.2e-16
car::Anova(m_3, type = "III")

model 3 diagnosis ###########

# All six regression diagnostic panels (see ?plot.lm).
par(mfrow = c(2, 3))
for (panel in 1:6) plot(m_3, which = panel)

par(mfrow = c(1, 1)) # reset
# Fitted values, residuals, leverages, and internally / externally
# studentized residuals for the influence analysis below.
m_3.yhat = fitted(m_3)
m_3.res = residuals(m_3)
m_3.h = hatvalues(m_3)
m_3.r = rstandard(m_3)
m_3.rr = rstudent(m_3)

which subject is most outlying with respect to the x space

# Distribution of the leverages (hat values); the observation most
# outlying in x-space has the largest hat value.
Hmisc::describe(m_3.h)
## m_3.h 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     2247        0     2247        1 0.009791 0.005107 0.004459 0.004936 
##      .25      .50      .75      .90      .95 
## 0.006261 0.008725 0.011708 0.015200 0.018498 
## 
## lowest : 0.002814313 0.002982653 0.003117284 0.003208262 0.003227126
## highest: 0.038012401 0.042497096 0.048833386 0.048889933 0.055461319
# Index and value of the maximum leverage.
m_3.h[which.max(m_3.h)]
##        422 
## 0.05546132

Assumption:LINE ##############################

(1)Linear: 2 approaches

partial regression plots

car::avPlots(m_3)

(2)Independence:

residuals <- resid(m_3)
acf(residuals, main = "Autocorrelation Function of Residuals")

pacf(residuals, main = "Partial Autocorrelation Function of Residuals")

dw_test <- dwtest(m_3)
print(dw_test)
## 
##  Durbin-Watson test
## 
## data:  m_3
## DW = 2.0074, p-value = 0.5688
## alternative hypothesis: true autocorrelation is greater than 0

(3)E: constant var: residuals-fitted values; transform for variance-stable…(total: 4 solutions)

car::residualPlots(m_3, type = "response")

##                   Test stat Pr(>|Test stat|)    
## SleepHrsNight       -0.4976        0.6188065    
## Age                 -3.8434        0.0001247 ***
## Gender              -0.6793        0.4970226    
## factor(Race1)                                   
## Poverty             -1.1801        0.2380940    
## TotChol             -1.5798        0.1142852    
## BPDiaAve             0.5367        0.5915115    
## BPSysAve            -3.6070        0.0003165 ***
## AlcoholYear          1.9234        0.0545555 .  
## Smoke100             0.0234        0.9813551    
## UrineFlow1          -0.3883        0.6978492    
## DaysMentHlthBad     -0.3539        0.7234147    
## DaysPhysHlthBad      0.5682        0.5699655    
## factor(HealthGen)                               
## PhysActive          -0.2079        0.8353039    
## Tukey test           0.4000        0.6891547    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
plot(m_3, which = 1)

or

ggplot(m_3, aes(x = m_3.yhat, y = m_3.res)) +
  geom_point(color = "blue", alpha = 0.8) +
  geom_hline(yintercept = 0,
             linetype = "dashed",
             color = "black") +
  labs(title = "constant variance assumption",
       x = "y hat",
       y = "Residuals") +
  theme_minimal()

conclusion: the constant variance assumption is basically not violated. The spread of the residuals appears to be fairly uniform across the range of predicted values, the assumption is more likely to hold

(4)Normality: residuals freq - residuals (4 plots: his, box, Q-Q, shapiro); transform

exam quartiles of the residuals

Hmisc::describe(m_3.res)
## m_3.res 
##         n   missing  distinct      Info      Mean       Gmd       .05       .10 
##      2247         0      2247         1 8.085e-17      6.67   -8.6246   -6.9209 
##       .25       .50       .75       .90       .95 
##   -4.0390   -0.6607    3.2560    7.4840   10.6204 
## 
## lowest : -16.61655 -14.99224 -14.93896 -13.77626 -13.40159
## highest:  28.87262  31.61046  33.88930  33.95985  35.70682
Hmisc::describe(m_3.res)$counts[c(".25", ".50", ".75")] #not symmetric
##       .25       .50       .75 
## "-4.0390" "-0.6607" " 3.2560"

histogram

par(mfrow = c(1, 1))
hist(m_3.res, breaks = 15)

Q-Q plot

qq.m_3.res = car::qqPlot(m_3.res)

m_3.res[qq.m_3.res]
##     1943      910 
## 35.70682 33.95985

influential observations #################

# Standard per-observation influence measures for m_3.
# hat(model.matrix(m_3)) and hatvalues(m_3) give identical leverages
# for an unweighted lm; hatvalues() avoids rebuilding the model matrix.
influence2 = data.frame(
  Residual      = resid(m_3),
  Rstudent      = rstudent(m_3),
  HatDiagH      = hatvalues(m_3),
  CovRatio      = covratio(m_3),
  DFFITS        = dffits(m_3),
  COOKsDistance = cooks.distance(m_3)
)

DFFITS

ols_plot_dffits(m_3)

influence2[order(abs(influence2$DFFITS), decreasing = T), ] %>% head()

From the plot above, we can see 2 observations with the largest (magnitude) of DFFITS, observation 879 and 1769 By printing the corresponding values of DFFITS in the output dataset, we can obtain their DFFITS values: 0.5673 for observation 879, 0.5872 for observation 1769

Cook’s D

ols_plot_cooksd_bar(m_3)

influence2[order(influence2$COOKsDistance, decreasing = T), ] %>% head()

From the plot above, we can see that the observation 879 and 1769 also have the largest Cook’s Distance. By printing the corresponding values of Cook’s D in the output dataset, we can obtain their Cook’s D values:0.0108 for observation 879, 0.0145 for observation 1769

leverage

ols_plot_resid_lev(m_3)

high leverage

influence2[order(influence2$HatDiagH, decreasing = T), ] %>% head()

high studentized residual

influence2[order(influence2$Rstudent, decreasing = T), ] %>% head()

From the plot above, we can see that the observation 1155 has the largest leverage (0.0368). Observations 1862 has the largest (in magnitude) externally studentized residual (5.9649).

From the plot above, there are 7 observations (1048, 1769, 1684, 74, 72, 1689, 1311) located in the intersection areas of both outlier and leverage; that is, those observations have both leverage and externally studentized residuals exceeding their respective thresholds. Due to their large DFFITS and Cook's D, they are potentially influential observations.

The thresholds for the externally studentized residual are -2 and 2, i.e. 2 in magnitude. The thresholds for the leverage of the R default is 0.011

From DFFITS, observations 879 and 1769 appear to be influential observations. Observation 1155 has extraordinarily large leverage. Therefore, I choose to remove these observations in the re-fitted model.

# Refit model 3 without the flagged observations.
# BUG FIX: the refit must use the same specification as m_3 --
# factor(Race1) and factor(HealthGen) -- otherwise the integer codes
# are treated as continuous, the model changes, and the coefficient
# vectors have different lengths, so the "change percent" comparison
# below silently recycles and prints meaningless values.
rm3.df3 = df3[-c(170, 208, 444, 926, 1361, 1454, 1546, 1655, 1910, 1958), ]
rm.m_3 = lm(
  BMI ~ SleepHrsNight + Age + Gender + factor(Race1) + Poverty + TotChol +
    BPDiaAve + BPSysAve + AlcoholYear + Smoke100 + UrineFlow1 +
    DaysMentHlthBad + DaysPhysHlthBad + factor(HealthGen) + PhysActive,
  data = rm3.df3
)

Before removing these observations, the estimated coefficients are:

summary(m_3)$coef
##                        Estimate  Std. Error     t value     Pr(>|t|)
## (Intercept)        27.324897955 1.832007232 14.91527843 5.063682e-48
## SleepHrsNight      -0.117162147 0.103680975 -1.13002551 2.585873e-01
## Age                 0.003942574 0.013318707  0.29601778 7.672441e-01
## Gender              0.407763087 0.280402482  1.45420641 1.460301e-01
## factor(Race1)2     -1.940768951 0.619151591 -3.13456184 1.743558e-03
## factor(Race1)3     -1.315082879 0.542229766 -2.42532402 1.537382e-02
## factor(Race1)4     -1.510238608 0.410939027 -3.67509170 2.433861e-04
## factor(Race1)5     -3.041375233 0.605088431 -5.02633182 5.398260e-07
## Poverty             0.056115667 0.088918729  0.63108940 5.280469e-01
## TotChol             0.008565966 0.131872653  0.06495635 9.482146e-01
## BPDiaAve            0.061457725 0.013401526  4.58587505 4.771754e-06
## BPSysAve            0.046897025 0.011243719  4.17095295 3.149657e-05
## AlcoholYear        -0.008132748 0.001470236 -5.53159289 3.545838e-08
## Smoke100           -0.912563541 0.280827055 -3.24955707 1.173076e-03
## UrineFlow1         -0.086054222 0.138360108 -0.62195833 5.340330e-01
## DaysMentHlthBad    -0.037213746 0.017514270 -2.12476717 3.371600e-02
## DaysPhysHlthBad     0.019369197 0.020324834  0.95298182 3.407028e-01
## factor(HealthGen)2 -2.921983768 0.974038001 -2.99986629 2.730955e-03
## factor(HealthGen)3 -4.504202045 0.965175428 -4.66671852 3.242377e-06
## factor(HealthGen)4 -6.258419551 0.990705138 -6.31713646 3.208005e-10
## factor(HealthGen)5 -8.167161763 1.044199344 -7.82145843 7.997961e-15
## PhysActive         -0.778610338 0.286657981 -2.71616487 6.655396e-03

After removing these observations, the estimated coefficients are:

summary(rm.m_3)$coef
##                     Estimate  Std. Error      t value     Pr(>|t|)
## (Intercept)     27.931772229 1.610013332  17.34878319 2.493565e-63
## SleepHrsNight   -0.149306764 0.102448360  -1.45738559 1.451513e-01
## Age              0.007538154 0.013092151   0.57577659 5.648245e-01
## Gender           0.437837920 0.275071652   1.59172316 1.115892e-01
## Race1           -0.494790939 0.116629002  -4.24243481 2.301832e-05
## Poverty          0.080793680 0.086489789   0.93414125 3.503326e-01
## TotChol          0.007081371 0.129713646   0.05459233 9.564682e-01
## BPDiaAve         0.066739760 0.013403431   4.97930420 6.872480e-07
## BPSysAve         0.043740703 0.011169838   3.91596587 9.275883e-05
## AlcoholYear     -0.007609936 0.001445499  -5.26457270 1.540361e-07
## Smoke100        -0.824668424 0.275771138  -2.99040875 2.816661e-03
## UrineFlow1      -0.094603489 0.136349841  -0.69382911 4.878619e-01
## DaysMentHlthBad -0.040870114 0.017274127  -2.36597271 1.806818e-02
## DaysPhysHlthBad  0.015732946 0.019579489   0.80354219 4.217474e-01
## HealthGen       -1.674247750 0.156736230 -10.68194477 5.230180e-26
## PhysActive      -0.749311170 0.282521486  -2.65222720 8.053167e-03

change percent

# Percent change in each coefficient after removing the flagged rows.
# NOTE(review): this elementwise comparison is only meaningful if m_3
# and rm.m_3 share an identical model specification (same factor
# coding). With mismatched term sets R recycles the shorter coefficient
# vector with a warning and the printed percentages are meaningless.
abs((rm.m_3$coefficients - m_3$coefficients) / (m_3$coefficients) * 100)
##        (Intercept)      SleepHrsNight                Age             Gender 
##       2.220957e+00       2.743601e+01       9.119879e+01       7.375565e+00 
##     factor(Race1)2     factor(Race1)3     factor(Race1)4     factor(Race1)5 
##       7.450542e+01       1.061436e+02       1.004689e+02       1.021944e+02 
##            Poverty            TotChol           BPDiaAve           BPSysAve 
##       2.205260e+01       1.888392e+02       1.441847e+03       3.017260e+02 
##        AlcoholYear           Smoke100         UrineFlow1    DaysMentHlthBad 
##       4.025376e+02       1.017240e+02       1.845573e+03       1.913533e+03 
##    DaysPhysHlthBad factor(HealthGen)2 factor(HealthGen)3 factor(HealthGen)4 
##       1.441072e+05       9.489023e+01       1.001674e+02       1.069960e+02 
## factor(HealthGen)5         PhysActive 
##       9.394170e+01       1.103767e+02

The estimated regression coefficients change noticeably after removing these observations: 5 of the estimates changed by more than 10%. The coefficient for SleepHrsNight remains statistically insignificant at the 95% confidence level.

multicollinearity ######################

Pearson correlations

# Variables entering the correlation screen (response BMI first).
var3 = c(
  "BMI", "SleepHrsNight", "Age", "Gender", "Race1", "TotChol",
  "BPDiaAve", "BPSysAve", "AlcoholYear", "Smoke100", "DaysPhysHlthBad",
  "PhysActive", "Poverty", "UrineFlow1", "DaysMentHlthBad", "HealthGen"
)
newData3 = df3[, var3]
library("corrplot")
## corrplot 0.92 loaded
par(mfrow = c(1, 2))
# Pairwise Pearson correlations among the covariates (column 1, the
# response BMI, is excluded).
# BUG FIX: method = "pearson" was previously passed to as.matrix()
# rather than cor(); the result was the same only because "pearson" is
# cor()'s default. Pass the argument where it belongs.
cormat3 = cor(as.matrix(newData3[, -1]), method = "pearson")
p.mat3 = cor.mtest(as.matrix(newData3[, -1]))$p
# Upper-triangle heatmap; non-significant (p >= 0.05) cells blanked.
corrplot(
  cormat3,
  method = "color",
  type = "upper",
  number.cex = 1,
  diag = FALSE,
  addCoef.col = "black",
  tl.col = "black",
  tl.srt = 90,
  p.mat = p.mat3,
  sig.level = 0.05,
  insig = "blank"  # FIX: removed the stray trailing comma
)

None of the covariates seem strongly correlated.There is no evidence of collinearity from the pair-wise correlations.

collinearity diagnostics (VIF)

car::vif(m_3)
##                       GVIF Df GVIF^(1/(2*Df))
## SleepHrsNight     1.069282  1        1.034061
## Age               1.342254  1        1.158557
## Gender            1.140187  1        1.067795
## factor(Race1)     1.245248  4        1.027796
## Poverty           1.321502  1        1.149566
## TotChol           1.130821  1        1.063400
## BPDiaAve          1.445271  1        1.202194
## BPSysAve          1.562024  1        1.249810
## AlcoholYear       1.122271  1        1.059373
## Smoke100          1.141210  1        1.068274
## UrineFlow1        1.045366  1        1.022432
## DaysMentHlthBad   1.144347  1        1.069742
## DaysPhysHlthBad   1.247355  1        1.116851
## factor(HealthGen) 1.455605  4        1.048046
## PhysActive        1.166136  1        1.079878

From the VIF values in the output above, once again we do not observe any potential collinearity issues. In fact, the VIF values are fairly small: none of the values exceed 10.

MLR & Diagnosis log ########################################

model_3 add additional risk factors ##

# Refit model 3 with a log-transformed outcome, log(BMI + 1).
# NOTE: this overwrites the untransformed m_3 from the section above.
df3$logBMI <- log(df3$BMI + 1)
m_3 <- lm(
  logBMI ~ SleepHrsNight + DaysMentHlthBad + Age + Gender + factor(Race1) +
    Poverty + TotChol + BPDiaAve + BPSysAve + AlcoholYear + Smoke100 +
    UrineFlow1 + DaysPhysHlthBad + factor(HealthGen) + PhysActive,
  data = df3
)
summary(m_3)
## 
## Call:
## lm(formula = logBMI ~ SleepHrsNight + DaysMentHlthBad + Age + 
##     Gender + factor(Race1) + Poverty + TotChol + BPDiaAve + BPSysAve + 
##     AlcoholYear + Smoke100 + UrineFlow1 + DaysPhysHlthBad + factor(HealthGen) + 
##     PhysActive, data = df3)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.61804 -0.12733 -0.00485  0.12226  0.78086 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         3.278e+00  5.765e-02  56.863  < 2e-16 ***
## SleepHrsNight      -4.718e-03  3.263e-03  -1.446 0.148321    
## DaysMentHlthBad    -1.265e-03  5.512e-04  -2.295 0.021825 *  
## Age                 4.023e-04  4.192e-04   0.960 0.337200    
## Gender              1.807e-03  8.824e-03   0.205 0.837761    
## factor(Race1)2     -4.739e-02  1.949e-02  -2.432 0.015084 *  
## factor(Race1)3     -2.532e-02  1.706e-02  -1.484 0.138009    
## factor(Race1)4     -4.245e-02  1.293e-02  -3.282 0.001045 ** 
## factor(Race1)5     -9.451e-02  1.904e-02  -4.963 7.45e-07 ***
## Poverty             2.848e-03  2.798e-03   1.018 0.308886    
## TotChol             3.721e-03  4.150e-03   0.897 0.370054    
## BPDiaAve            1.961e-03  4.217e-04   4.649 3.53e-06 ***
## BPSysAve            1.532e-03  3.539e-04   4.331 1.55e-05 ***
## AlcoholYear        -2.676e-04  4.627e-05  -5.784 8.33e-09 ***
## Smoke100           -2.991e-02  8.838e-03  -3.384 0.000726 ***
## UrineFlow1         -2.890e-03  4.354e-03  -0.664 0.506924    
## DaysPhysHlthBad     6.034e-04  6.396e-04   0.943 0.345644    
## factor(HealthGen)2 -8.521e-02  3.065e-02  -2.780 0.005486 ** 
## factor(HealthGen)3 -1.264e-01  3.037e-02  -4.163 3.27e-05 ***
## factor(HealthGen)4 -1.802e-01  3.118e-02  -5.781 8.48e-09 ***
## factor(HealthGen)5 -2.469e-01  3.286e-02  -7.512 8.36e-14 ***
## PhysActive         -2.262e-02  9.021e-03  -2.507 0.012232 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1954 on 2225 degrees of freedom
## Multiple R-squared:  0.1547, Adjusted R-squared:  0.1467 
## F-statistic: 19.39 on 21 and 2225 DF,  p-value: < 2.2e-16
# Type III ANOVA: test each term in m_3 adjusted for all other terms.
car::Anova(m_3, type = "III")

model 3 diagnosis ###########

# Standard lm diagnostic panels for model 3 (see ?plot.lm for details).
par(mfrow = c(2, 3))
for (panel in 1:6) {
  plot(m_3, which = panel)
}

# Restore the default single-plot layout.
par(mfrow = c(1, 1))
# Quantities reused by the later diagnostics: fitted values, residuals,
# leverages, and internally/externally studentized residuals.
m_3.yhat <- fitted(m_3)
m_3.res <- resid(m_3)
m_3.h <- hatvalues(m_3)
m_3.r <- rstandard(m_3)
m_3.rr <- rstudent(m_3)

Which subject is most outlying with respect to the x-space (i.e., which observation has the largest leverage)?

# Distribution of the hat (leverage) values for m_3.
Hmisc::describe(m_3.h)
## m_3.h 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     2247        0     2247        1 0.009791 0.005107 0.004459 0.004936 
##      .25      .50      .75      .90      .95 
## 0.006261 0.008725 0.011708 0.015200 0.018498 
## 
## lowest : 0.002814313 0.002982653 0.003117284 0.003208262 0.003227126
## highest: 0.038012401 0.042497096 0.048833386 0.048889933 0.055461319
# The observation with the largest leverage (most outlying in the x-space).
m_3.h[which.max(m_3.h)]
##        422 
## 0.05546132

Assumption:LINE ##############################

(1)Linear: 2 approaches

partial regression plots

# Added-variable (partial regression) plots: assess linearity per predictor.
car::avPlots(m_3)

(2)Independence:

# (2) Independence: look for serial correlation in the residuals.
# NOTE(review): this assigns to `residuals`, masking stats::residuals()
# in the global environment — consider a different name.
residuals <- resid(m_3)
acf(residuals, main = "Autocorrelation Function of Residuals")

pacf(residuals, main = "Partial Autocorrelation Function of Residuals")

# Durbin-Watson test for first-order autocorrelation (lmtest::dwtest).
dw_test <- dwtest(m_3)
print(dw_test)
## 
##  Durbin-Watson test
## 
## data:  m_3
## DW = 2.0148, p-value = 0.6364
## alternative hypothesis: true autocorrelation is greater than 0

(3)E: constant var: residuals-fitted values; transform for variance-stable…(total: 4 solutions)

# Curvature tests and residual-vs-predictor plots; type = "response"
# plots the raw (response-scale) residuals.
car::residualPlots(m_3, type = "response")

##                   Test stat Pr(>|Test stat|)    
## SleepHrsNight       -0.7992          0.42429    
## DaysMentHlthBad     -0.2089          0.83458    
## Age                 -4.0747        4.769e-05 ***
## Gender              -0.3071          0.75882    
## factor(Race1)                                   
## Poverty             -0.9680          0.33315    
## TotChol             -1.8454          0.06511 .  
## BPDiaAve             0.5158          0.60601    
## BPSysAve            -4.3661        1.323e-05 ***
## AlcoholYear          1.9986          0.04577 *  
## Smoke100             0.5060          0.61289    
## UrineFlow1          -0.2751          0.78326    
## DaysPhysHlthBad      0.7872          0.43126    
## factor(HealthGen)                               
## PhysActive           0.0674          0.94625    
## Tukey test          -1.8323          0.06691 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
plot(m_3, which = 1)

or

# Residuals vs fitted values, built manually with ggplot2, to eyeball
# the constant-variance assumption.
# NOTE(review): aes() refers to the global vectors m_3.yhat / m_3.res,
# not to columns of m_3.
ggplot(m_3, aes(x = m_3.yhat, y = m_3.res)) +
  geom_point(color = "blue", alpha = 0.8) +
  geom_hline(yintercept = 0,
             linetype = "dashed",
             color = "black") +
  labs(title = "constant variance assumption",
       x = "y hat",
       y = "Residuals") +
  theme_minimal()

Conclusion: the constant-variance assumption does not appear to be violated. The spread of the residuals is fairly uniform across the range of fitted values, so the assumption is plausible.

(4)Normality: residuals freq - residuals (4 plots: his, box, Q-Q, shapiro); transform

exam quartiles of the residuals

# Summary of the residual distribution (mean should be ~0 for OLS).
Hmisc::describe(m_3.res)
## m_3.res 
##         n   missing  distinct      Info      Mean       Gmd       .05       .10 
##      2247         0      2247         1 2.528e-18    0.2172  -0.30704  -0.23984 
##       .25       .50       .75       .90       .95 
##  -0.12733  -0.00485   0.12226   0.24533   0.32858 
## 
## lowest : -0.6180372 -0.5974308 -0.5951820 -0.5507425 -0.5295147
## highest:  0.7040679  0.7130134  0.7234443  0.7624020  0.7808639
# Quartiles only: roughly symmetric residuals would have |Q1| ~ Q3.
Hmisc::describe(m_3.res)$counts[c(".25", ".50", ".75")] #not symmetric
##        .25        .50        .75 
## "-0.12733" "-0.00485" " 0.12226"

histogram

# Histogram of residuals for a quick visual normality check.
par(mfrow = c(1, 1))
hist(m_3.res, breaks = 15)

Q-Q plot

# Normal Q-Q plot; car::qqPlot returns the indices of the most extreme points.
qq.m_3.res = car::qqPlot(m_3.res)

# Residual values of the flagged extreme points.
m_3.res[qq.m_3.res]
##       910      1943 
## 0.7808639 0.7624020

influential observations #################

# Collect the standard influence measures for each observation of m_3.
influence3 = data.frame(
  Residual = resid(m_3),
  Rstudent = rstudent(m_3),          # externally studentized residuals
  HatDiagH = hat(model.matrix(m_3)), # leverage (hat-matrix diagonal)
  CovRatio = covratio(m_3),
  DFFITS = dffits(m_3),
  COOKsDistance = cooks.distance(m_3)
)

DFFITS

# DFFITS plot (olsrr): flags observations whose fit changes most when removed.
ols_plot_dffits(m_3)

# Observations with the largest |DFFITS| first.
influence3[order(abs(influence3$DFFITS), decreasing = T), ] %>% head()

From the plot above, we can see the 2 observations with the largest DFFITS in magnitude: observations 879 and 1769. By printing the corresponding values of DFFITS in the output dataset, we obtain their DFFITS values: 0.5673 for observation 879 and 0.5872 for observation 1769.

Cook’s D

# Cook's distance bar plot (olsrr).
ols_plot_cooksd_bar(m_3)

# Observations with the largest Cook's distance first.
influence3[order(influence3$COOKsDistance, decreasing = T), ] %>% head()

From the plot above, we can see that the observation 879 and 1769 also have the largest Cook’s Distance. By printing the corresponding values of Cook’s D in the output dataset, we can obtain their Cook’s D values:0.0108 for observation 879, 0.0145 for observation 1769

leverage

# Studentized-residual vs leverage plot (olsrr).
ols_plot_resid_lev(m_3)

high leverage

# Observations with the highest leverage.
influence3[order(influence3$HatDiagH, decreasing = T), ] %>% head()

high studentized residual

# Observations with the largest externally studentized residuals.
influence3[order(influence3$Rstudent, decreasing = T), ] %>% head()

From the plot above, we can see that the observation 1155 has the largest leverage (0.0368). Observations 1862 has the largest (in magnitude) externally studentized residual (5.9649).

From the plot above, there are 7 observations (1048, 1769, 1684, 74, 72, 1689, 1311) located in the intersection region of outlier and leverage; that is, these observations have both a leverage and an externally studentized residual exceeding their respective thresholds. Because of their large DFFITS and Cook's D values, they are potentially influential observations.

The thresholds for the externally studentized residual are -2 and 2, i.e. 2 in magnitude. The default leverage threshold used by R here is 0.011.

From DFFITS, observations 879 and 1769 appear to be influential, and observation 1155 has extraordinarily large leverage. Therefore, I remove these observations and re-fit the model. (NOTE: the row indices removed in the code below do not match the observations identified here — verify the correct indices.)

# Re-fit model 3 without the flagged observations.
# NOTE(review): the row indices removed here do not match the observations
# (879, 1769, 1155) identified in the influence diagnostics above — confirm.
rm3.df3 <- df3[-c(444, 926, 1348, 1454, 1655, 1910, 1958), ]
# BUG FIX: use the same specification as m_3 — including factor() around
# Race1 and HealthGen (previously re-fit as numeric, changing the model)
# and the same term order — so the coefficient vectors are directly
# comparable to m_3 by both name and position.
rm.m_3 <- lm(
  logBMI ~ SleepHrsNight + DaysMentHlthBad + Age + Gender + factor(Race1) + Poverty + TotChol + BPDiaAve + BPSysAve + AlcoholYear + Smoke100 + UrineFlow1 +
    DaysPhysHlthBad + factor(HealthGen) + PhysActive,
  rm3.df3
)

Before removing these observations, the estimated coefficients are:

summary(m_3)$coef
##                         Estimate   Std. Error    t value     Pr(>|t|)
## (Intercept)         3.2784100219 0.0576543432 56.8631926 0.000000e+00
## SleepHrsNight      -0.0047181234 0.0032629012 -1.4459903 1.483208e-01
## DaysMentHlthBad    -0.0012649713 0.0005511844 -2.2950057 2.182547e-02
## Age                 0.0004023475 0.0004191475  0.9599186 3.372005e-01
## Gender              0.0018070868 0.0088244307  0.2047823 8.377610e-01
## factor(Race1)2     -0.0473922190 0.0194850641 -2.4322332 1.508442e-02
## factor(Race1)3     -0.0253196710 0.0170642891 -1.4837812 1.380086e-01
## factor(Race1)4     -0.0424502990 0.0129324924 -3.2824530 1.045004e-03
## factor(Race1)5     -0.0945148238 0.0190424882 -4.9633652 7.453716e-07
## Poverty             0.0028481217 0.0027983246  1.0177953 3.088859e-01
## TotChol             0.0037207960 0.0041501098  0.8965536 3.700541e-01
## BPDiaAve            0.0019606291 0.0004217539  4.6487515 3.534993e-06
## BPSysAve            0.0015324057 0.0003538465  4.3307081 1.552264e-05
## AlcoholYear        -0.0002676120 0.0000462692 -5.7838049 8.332325e-09
## Smoke100           -0.0299111317 0.0088377923 -3.3844574 7.255600e-04
## UrineFlow1         -0.0028901130 0.0043542738 -0.6637417 5.069244e-01
## DaysPhysHlthBad     0.0006033501 0.0006396345  0.9432733 3.456435e-01
## factor(HealthGen)2 -0.0852089875 0.0306535477 -2.7797431 5.485942e-03
## factor(HealthGen)3 -0.1264388994 0.0303746374 -4.1626472 3.265555e-05
## factor(HealthGen)4 -0.1802336838 0.0311780723 -5.7807834 8.481117e-09
## factor(HealthGen)5 -0.2468663127 0.0328615664 -7.5123112 8.362784e-14
## PhysActive         -0.0226201539 0.0090212949 -2.5074176 1.223242e-02

After removing these observations, the estimated coefficients are:

summary(rm.m_3)$coef
##                      Estimate   Std. Error     t value     Pr(>|t|)
## (Intercept)      3.3050560522 5.102395e-02  64.7746013 0.000000e+00
## SleepHrsNight   -0.0047524563 3.249225e-03  -1.4626432 1.437063e-01
## Age              0.0004216798 4.153499e-04   1.0152398 3.101019e-01
## Gender           0.0032635982 8.718949e-03   0.3743110 7.082087e-01
## Race1           -0.0153799757 3.699991e-03  -4.1567606 3.350196e-05
## Poverty          0.0033436926 2.742497e-03   1.2192148 2.228920e-01
## TotChol          0.0038474226 4.117724e-03   0.9343565 3.502215e-01
## BPDiaAve         0.0021017603 4.252063e-04   4.9429193 8.270640e-07
## BPSysAve         0.0015086789 3.531594e-04   4.2719493 2.019304e-05
## AlcoholYear     -0.0002568527 4.589462e-05  -5.5965757 2.455559e-08
## Smoke100        -0.0271010352 8.750036e-03  -3.0972485 1.977648e-03
## UrineFlow1      -0.0031016415 4.327971e-03  -0.7166502 4.736652e-01
## DaysMentHlthBad -0.0013536816 5.475215e-04  -2.4723807 1.349595e-02
## DaysPhysHlthBad  0.0006020118 6.186968e-04   0.9730321 3.306432e-01
## HealthGen       -0.0521366128 4.975551e-03 -10.4785605 4.123413e-25
## PhysActive      -0.0223651970 8.959716e-03  -2.4961948 1.262525e-02

change percent

# Percent change in each coefficient after removing the flagged observations.
# BUG FIX: match coefficients by name — positional subtraction silently
# misaligns when the two fits have different term orders or lengths
# (the previous output showed nonsense values such as 5.5e+05 %).
common_terms <- intersect(names(m_3$coefficients), names(rm.m_3$coefficients))
abs((rm.m_3$coefficients[common_terms] - m_3$coefficients[common_terms]) /
      m_3$coefficients[common_terms] * 100)
##        (Intercept)      SleepHrsNight    DaysMentHlthBad                Age 
##       8.127730e-01       7.276813e-01       1.333351e+02       7.111391e+02 
##             Gender     factor(Race1)2     factor(Race1)3     factor(Race1)4 
##       9.510922e+02       1.070554e+02       1.151954e+02       1.049511e+02 
##     factor(Race1)5            Poverty            TotChol           BPDiaAve 
##       1.015962e+02       1.090183e+02       8.283666e+02       2.581962e+02 
##           BPSysAve        AlcoholYear           Smoke100         UrineFlow1 
##       1.883370e+02       3.249570e+02       7.430505e+01       6.738520e+02 
##    DaysPhysHlthBad factor(HealthGen)2 factor(HealthGen)3 factor(HealthGen)4 
##       5.476841e+05       9.442259e+01       1.003335e+02       1.018108e+02 
## factor(HealthGen)5         PhysActive 
##       9.376992e+01       1.147819e+02

The estimated regression coefficients do not change only slightly after removing these observations: 5 of the estimates changed by more than 10%. The p-value for the SleepHrsNight coefficient remains non-significant at the 95% confidence level.

multicollinearity ######################

Pearson correlations

# Variables entering model 3 (response BMI listed first).
var3 <- c(
  "BMI",
  "SleepHrsNight",
  "Age",
  "Gender",
  "Race1",
  "TotChol",
  "BPDiaAve",
  "BPSysAve",
  "AlcoholYear",
  "Smoke100",
  "DaysPhysHlthBad",
  "PhysActive",
  "Poverty",
  "UrineFlow1",
  "DaysMentHlthBad",
  "HealthGen"
)
newData3 <- df3[, var3]
library("corrplot")
par(mfrow = c(1, 2))
# BUG FIX: method = "pearson" was previously passed to as.matrix() instead
# of cor() because of a misplaced parenthesis; pass it to cor() directly.
cormat3 <- cor(as.matrix(newData3[, -c(1)]), method = "pearson")
# Pair-wise significance tests for the correlation matrix.
p.mat3 <- cor.mtest(as.matrix(newData3[, -c(1)]))$p
# Upper-triangle heat map; cells with p >= 0.05 are left blank.
corrplot(
  cormat3,
  method = "color",
  type = "upper",
  number.cex = 1,
  diag = FALSE,
  addCoef.col = "black",
  tl.col = "black",
  tl.srt = 90,
  p.mat = p.mat3,
  sig.level = 0.05,
  insig = "blank"  # trailing comma removed (was an empty trailing argument)
)

None of the covariates seem strongly correlated. There is no evidence of collinearity from the pair-wise correlations.

collinearity diagnostics (VIF)

# Generalized VIFs; GVIF^(1/(2*Df)) is comparable across terms with
# different degrees of freedom.
car::vif(m_3)
##                       GVIF Df GVIF^(1/(2*Df))
## SleepHrsNight     1.069282  1        1.034061
## DaysMentHlthBad   1.144347  1        1.069742
## Age               1.342254  1        1.158557
## Gender            1.140187  1        1.067795
## factor(Race1)     1.245248  4        1.027796
## Poverty           1.321502  1        1.149566
## TotChol           1.130821  1        1.063400
## BPDiaAve          1.445271  1        1.202194
## BPSysAve          1.562024  1        1.249810
## AlcoholYear       1.122271  1        1.059373
## Smoke100          1.141210  1        1.068274
## UrineFlow1        1.045366  1        1.022432
## DaysPhysHlthBad   1.247355  1        1.116851
## factor(HealthGen) 1.455605  4        1.048046
## PhysActive        1.166136  1        1.079878

From the VIF values in the output above, once again we do not observe any potential collinearity issues. In fact, the VIF values are fairly small: none of the values exceed 10.

MLR & Diagnosis with interaction ########################################

model_4 add additional risk factors ##

# Log-transform the response (log(BMI + 1)) and fit the interaction model:
# m_full adds SleepHrsNight:Age and SleepHrsNight:Gender to model 3.
df3$logBMI = log(df3$BMI + 1)
m_full = lm(
  logBMI ~ SleepHrsNight + DaysMentHlthBad+ Age + Gender + factor(Race1)  + Poverty + TotChol + BPDiaAve + BPSysAve + AlcoholYear + Smoke100 + UrineFlow1 +
    DaysPhysHlthBad + factor(HealthGen) + PhysActive + SleepHrsNight*Age + SleepHrsNight*Gender,
  df3
)
summary(m_full)
## 
## Call:
## lm(formula = logBMI ~ SleepHrsNight + DaysMentHlthBad + Age + 
##     Gender + factor(Race1) + Poverty + TotChol + BPDiaAve + BPSysAve + 
##     AlcoholYear + Smoke100 + UrineFlow1 + DaysPhysHlthBad + factor(HealthGen) + 
##     PhysActive + SleepHrsNight * Age + SleepHrsNight * Gender, 
##     data = df3)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.6234 -0.1294 -0.0049  0.1206  0.8062 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           3.3688443  0.0964199  34.939  < 2e-16 ***
## SleepHrsNight        -0.0177956  0.0115858  -1.536 0.124685    
## DaysMentHlthBad      -0.0012489  0.0005503  -2.269 0.023341 *  
## Age                  -0.0033337  0.0019138  -1.742 0.081656 .  
## Gender                0.1152578  0.0440267   2.618 0.008907 ** 
## factor(Race1)2       -0.0485063  0.0194505  -2.494 0.012709 *  
## factor(Race1)3       -0.0256227  0.0170349  -1.504 0.132690    
## factor(Race1)4       -0.0431426  0.0129089  -3.342 0.000845 ***
## factor(Race1)5       -0.0930094  0.0190114  -4.892 1.07e-06 ***
## Poverty               0.0028324  0.0027972   1.013 0.311366    
## TotChol               0.0033009  0.0041444   0.796 0.425845    
## BPDiaAve              0.0019674  0.0004209   4.674 3.13e-06 ***
## BPSysAve              0.0015546  0.0003532   4.401 1.13e-05 ***
## AlcoholYear          -0.0002695  0.0000462  -5.833 6.23e-09 ***
## Smoke100             -0.0299332  0.0088214  -3.393 0.000703 ***
## UrineFlow1           -0.0027372  0.0043481  -0.630 0.529077    
## DaysPhysHlthBad       0.0006245  0.0006386   0.978 0.328195    
## factor(HealthGen)2   -0.0860758  0.0306348  -2.810 0.005002 ** 
## factor(HealthGen)3   -0.1279830  0.0303267  -4.220 2.54e-05 ***
## factor(HealthGen)4   -0.1808754  0.0311307  -5.810 7.14e-09 ***
## factor(HealthGen)5   -0.2476517  0.0328054  -7.549 6.35e-14 ***
## PhysActive           -0.0231446  0.0090319  -2.563 0.010456 *  
## SleepHrsNight:Age     0.0005509  0.0002746   2.006 0.044979 *  
## SleepHrsNight:Gender -0.0166893  0.0063397  -2.632 0.008535 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.195 on 2223 degrees of freedom
## Multiple R-squared:  0.1587, Adjusted R-squared:   0.15 
## F-statistic: 18.24 on 23 and 2223 DF,  p-value: < 2.2e-16
# Type III tests and confidence intervals for the interaction model.
car::Anova(m_full, type = "III")
Confint(m_full)
##                           Estimate         2.5 %        97.5 %
## (Intercept)           3.3688442791  3.179762e+00  3.5579268348
## SleepHrsNight        -0.0177956336 -4.051579e-02  0.0049245242
## DaysMentHlthBad      -0.0012489159 -2.328143e-03 -0.0001696883
## Age                  -0.0033336861 -7.086650e-03  0.0004192774
## Gender                0.1152578004  2.892000e-02  0.2015956012
## factor(Race1)2       -0.0485062616 -8.664922e-02 -0.0103633015
## factor(Race1)3       -0.0256226809 -5.902864e-02  0.0077832799
## factor(Race1)4       -0.0431425559 -6.845724e-02 -0.0178278742
## factor(Race1)5       -0.0930094175 -1.302913e-01 -0.0557275358
## Poverty               0.0028324405 -2.653000e-03  0.0083178808
## TotChol               0.0033008956 -4.826430e-03  0.0114282217
## BPDiaAve              0.0019673653  1.141908e-03  0.0027928229
## BPSysAve              0.0015546299  8.619339e-04  0.0022473259
## AlcoholYear          -0.0002694855 -3.600834e-04 -0.0001788876
## Smoke100             -0.0299332383 -4.723231e-02 -0.0126341685
## UrineFlow1           -0.0027371557 -1.126381e-02  0.0057895026
## DaysPhysHlthBad       0.0006245193 -6.277692e-04  0.0018768079
## factor(HealthGen)2   -0.0860758400 -1.461517e-01 -0.0259999717
## factor(HealthGen)3   -0.1279829527 -1.874546e-01 -0.0685113157
## factor(HealthGen)4   -0.1808753562 -2.419237e-01 -0.1198269982
## factor(HealthGen)5   -0.2476517313 -3.119841e-01 -0.1833193210
## PhysActive           -0.0231445878 -4.085637e-02 -0.0054328082
## SleepHrsNight:Age     0.0005509040  1.234621e-05  0.0010894618
## SleepHrsNight:Gender -0.0166893149 -2.912172e-02 -0.0042569113

model 4 diagnosis ###########

# Standard lm diagnostic panels for the interaction model (see ?plot.lm).
par(mfrow = c(2, 3))
for (panel in 1:6) {
  plot(m_full, which = panel)
}

# Back to a single plotting region.
par(mfrow = c(1, 1))
# Quantities reused by the later diagnostics: fitted values, residuals,
# leverages, and internally/externally studentized residuals.
m_full.yhat <- fitted(m_full)
m_full.res <- resid(m_full)
m_full.h <- hatvalues(m_full)
m_full.r <- rstandard(m_full)
m_full.rr <- rstudent(m_full)

which subject is most outlying with respect to the x space

Hmisc::describe(m_full.h)
## m_full.h 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     2247        0     2247        1  0.01068 0.005784 0.004624 0.005195 
##      .25      .50      .75      .90      .95 
## 0.006784 0.009383 0.012802 0.017108 0.021401 
## 
## lowest : 0.002841614 0.002995724 0.003127421 0.003231272 0.003285363
## highest: 0.046156742 0.046910462 0.048836799 0.049628556 0.060943543
m_full.h[which.max(m_full.h)]
##        422 
## 0.06094354

Assumption:LINE ##############################

(1)Linear: 2 approaches

partial regression plots

# Added-variable (partial regression) plots for the interaction model.
car::avPlots(m_full)

(2)Independence:

# (2) Independence: serial correlation checks for the interaction model.
# NOTE(review): this assigns to `residuals`, masking stats::residuals()
# in the global environment — consider a different name.
residuals <- resid(m_full)
acf(residuals, main = "Autocorrelation Function of Residuals")

pacf(residuals, main = "Partial Autocorrelation Function of Residuals")

# Durbin-Watson test for first-order autocorrelation (lmtest::dwtest).
dw_test <- dwtest(m_full)
print(dw_test)
## 
##  Durbin-Watson test
## 
## data:  m_full
## DW = 2.0126, p-value = 0.6159
## alternative hypothesis: true autocorrelation is greater than 0

(3)E: constant var: residuals-fitted values; transform for variance-stable…(total: 4 solutions)

# Curvature tests and residual-vs-predictor plots for the interaction model.
car::residualPlots(m_full, type = "response")

##                   Test stat Pr(>|Test stat|)    
## SleepHrsNight       -0.2860          0.77489    
## DaysMentHlthBad     -0.3074          0.75854    
## Age                 -4.0272        5.835e-05 ***
## Gender              -0.2921          0.77024    
## factor(Race1)                                   
## Poverty             -0.9911          0.32176    
## TotChol             -1.8770          0.06065 .  
## BPDiaAve             0.5684          0.56985    
## BPSysAve            -4.3959        1.155e-05 ***
## AlcoholYear          2.0151          0.04401 *  
## Smoke100             0.5031          0.61493    
## UrineFlow1          -0.3027          0.76215    
## DaysPhysHlthBad      0.9147          0.36044    
## factor(HealthGen)                               
## PhysActive           0.1306          0.89611    
## Tukey test          -1.2992          0.19389    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
plot(m_full, which = 1)

or

# Residuals vs fitted values for the interaction model (constant-variance
# check). NOTE(review): aes() refers to the global vectors m_full.yhat /
# m_full.res, not to columns of m_full.
ggplot(m_full, aes(x = m_full.yhat, y = m_full.res)) +
  geom_point(color = "blue", alpha = 0.8) +
  geom_hline(yintercept = 0,
             linetype = "dashed",
             color = "black") +
  labs(title = "constant variance assumption",
       x = "y hat",
       y = "Residuals") +
  theme_minimal()

conclusion: the constant variance assumption is basically not violated. The spread of the residuals appears to be fairly uniform across the range of predicted values, the assumption is more likely to hold

(4)Normality: residuals freq - residuals (4 plots: his, box, Q-Q, shapiro); transform

exam quartiles of the residuals

Hmisc::describe(m_full.res)
## m_full.res 
##         n   missing  distinct      Info      Mean       Gmd       .05       .10 
##      2247         0      2247         1 1.652e-18    0.2165 -0.302479 -0.240208 
##       .25       .50       .75       .90       .95 
## -0.129418 -0.004903  0.120596  0.244933  0.328033 
## 
## lowest : -0.6233869 -0.6167844 -0.6131555 -0.5549558 -0.5291444
## highest:  0.7058034  0.7180306  0.7472944  0.7760889  0.8062185
Hmisc::describe(m_full.res)$counts[c(".25", ".50", ".75")] #not symmetric
##         .25         .50         .75 
## "-0.129418" "-0.004903" " 0.120596"

histogram

par(mfrow = c(1, 1))
hist(m_full.res, breaks = 15)

Q-Q plot

qq.m_full.res = car::qqPlot(m_full.res)

m_full.res[qq.m_full.res]
##       910      1943 
## 0.8062185 0.7760889

influential observations #################

# Collect the standard influence measures for each observation of m_full.
influence4 = data.frame(
  Residual = resid(m_full),
  Rstudent = rstudent(m_full),          # externally studentized residuals
  HatDiagH = hat(model.matrix(m_full)), # leverage (hat-matrix diagonal)
  CovRatio = covratio(m_full),
  DFFITS = dffits(m_full),
  COOKsDistance = cooks.distance(m_full)
)

DFFITS

ols_plot_dffits(m_full)

influence4[order(abs(influence4$DFFITS), decreasing = T), ] %>% head()

From the plot above, we can see the 2 observations with the largest DFFITS in magnitude: observations 879 and 1769. By printing the corresponding values of DFFITS in the output dataset, we obtain their DFFITS values: 0.5673 for observation 879 and 0.5872 for observation 1769.

Cook’s D

ols_plot_cooksd_bar(m_full)

influence4[order(influence4$COOKsDistance,decreasing = T),] %>% head()

From the plot above, there are 7 observations (1048, 1769, 1684, 74, 72, 1689, 1311) located in the intersection region of outlier and leverage; that is, these observations have both a leverage and an externally studentized residual exceeding their respective thresholds. Because of their large DFFITS and Cook's D values, they are potentially influential observations.

The thresholds for the externally studentized residual are -2 and 2, i.e. 2 in magnitude. The default leverage threshold used by R here is 0.011.

leverage

ols_plot_resid_lev(m_full)

high leverage

influence4[order(influence4$HatDiagH,decreasing = T),] %>% head()

high studentized residual

influence4[order(influence4$Rstudent,decreasing = T),] %>% head()

From DFFITS, observations 879 and 1769 appear to be influential, and observation 1155 has extraordinarily large leverage. Therefore, I remove the flagged observations (8 rows) and re-fit the model.

# Re-fit the interaction model without the flagged influential observations.
rm4.df3 <- df3[-c(1048, 1769, 1684, 74, 72, 1689, 1311, 879), ]
# BUG FIX: keep the exact term order of m_full (DaysMentHlthBad had been
# moved), so the coefficient vectors align by position as well as name
# when compared below; the previous misalignment produced nonsense
# percent-change values (e.g. 1.1e+04 % for Smoke100).
rm.m_full <- lm(
  logBMI ~ SleepHrsNight + DaysMentHlthBad + Age + Gender + factor(Race1) + Poverty + TotChol + BPDiaAve + BPSysAve + AlcoholYear + Smoke100 + UrineFlow1 +
    DaysPhysHlthBad + factor(HealthGen) + PhysActive + SleepHrsNight*Age + SleepHrsNight*Gender,
  rm4.df3
)

Before removing these observations, the estimated coefficients are:

summary(m_full)$coef
##                           Estimate   Std. Error    t value      Pr(>|t|)
## (Intercept)           3.3688442791 9.641994e-02 34.9392910 1.463092e-213
## SleepHrsNight        -0.0177956336 1.158582e-02 -1.5359841  1.246846e-01
## DaysMentHlthBad      -0.0012489159 5.503366e-04 -2.2693673  2.334143e-02
## Age                  -0.0033336861 1.913770e-03 -1.7419471  8.165600e-02
## Gender                0.1152578004 4.402672e-02  2.6179055  8.907161e-03
## factor(Race1)2       -0.0485062616 1.945046e-02 -2.4938368  1.270922e-02
## factor(Race1)3       -0.0256226809 1.703489e-02 -1.5041295  1.326901e-01
## factor(Race1)4       -0.0431425559 1.290886e-02 -3.3420890  8.452748e-04
## factor(Race1)5       -0.0930094175 1.901136e-02 -4.8923072  1.068055e-06
## Poverty               0.0028324405 2.797222e-03  1.0125906  3.113659e-01
## TotChol               0.0033008956 4.144413e-03  0.7964687  4.258447e-01
## BPDiaAve              0.0019673653 4.209303e-04  4.6738508  3.132952e-06
## BPSysAve              0.0015546299 3.532304e-04  4.4011781  1.127568e-05
## AlcoholYear          -0.0002694855 4.619909e-05 -5.8331340  6.234466e-09
## Smoke100             -0.0299332383 8.821413e-03 -3.3932477  7.027982e-04
## UrineFlow1           -0.0027371557 4.348047e-03 -0.6295138  5.290774e-01
## DaysPhysHlthBad       0.0006245193 6.385866e-04  0.9779712  3.281950e-01
## factor(HealthGen)2   -0.0860758400 3.063483e-02 -2.8097380  5.001570e-03
## factor(HealthGen)3   -0.1279829527 3.032671e-02 -4.2201399  2.539608e-05
## factor(HealthGen)4   -0.1808753562 3.113073e-02 -5.8101859  7.137490e-09
## factor(HealthGen)5   -0.2476517313 3.280539e-02 -7.5491171  6.354907e-14
## PhysActive           -0.0231445878 9.031868e-03 -2.5625472  1.045608e-02
## SleepHrsNight:Age     0.0005509040 2.746298e-04  2.0059875  4.497853e-02
## SleepHrsNight:Gender -0.0166893149 6.339726e-03 -2.6324978  8.534622e-03

After removing these observations, the estimated coefficients are:

summary(rm.m_full)$coef
##                           Estimate   Std. Error    t value      Pr(>|t|)
## (Intercept)           3.3680640105 9.653194e-02 34.8906707 6.115538e-213
## SleepHrsNight        -0.0176701164 1.160014e-02 -1.5232675  1.278346e-01
## Age                  -0.0033187735 1.916556e-03 -1.7316340  8.347797e-02
## Gender                0.1173024056 4.416962e-02  2.6557260  7.970518e-03
## factor(Race1)2       -0.0497356294 1.946971e-02 -2.5545132  1.070000e-02
## factor(Race1)3       -0.0261341401 1.706844e-02 -1.5311386  1.258779e-01
## factor(Race1)4       -0.0448559732 1.294335e-02 -3.4655616  5.391989e-04
## factor(Race1)5       -0.0941809010 1.903251e-02 -4.9484215  8.045103e-07
## Poverty               0.0029605378 2.804053e-03  1.0558065  2.911717e-01
## TotChol               0.0033426614 4.155265e-03  0.8044400  4.212293e-01
## BPDiaAve              0.0019915617 4.218504e-04  4.7210139  2.493370e-06
## BPSysAve              0.0015362161 3.539458e-04  4.3402580  1.487146e-05
## AlcoholYear          -0.0002697270 4.626138e-05 -5.8305013  6.335210e-09
## Smoke100             -0.0297821540 8.841691e-03 -3.3683777  7.690604e-04
## UrineFlow1           -0.0028886485 4.392151e-03 -0.6576842  5.108094e-01
## DaysMentHlthBad      -0.0011669861 5.544332e-04 -2.1048274  3.541871e-02
## DaysPhysHlthBad       0.0006043782 6.395009e-04  0.9450779  3.447222e-01
## factor(HealthGen)2   -0.0867078232 3.066055e-02 -2.8279933  4.726192e-03
## factor(HealthGen)3   -0.1270707356 3.034738e-02 -4.1872055  2.934592e-05
## factor(HealthGen)4   -0.1799745606 3.115368e-02 -5.7769919  8.676582e-09
## factor(HealthGen)5   -0.2467543622 3.282786e-02 -7.5166149  8.111908e-14
## PhysActive           -0.0238843563 9.046253e-03 -2.6402486  8.342681e-03
## SleepHrsNight:Age     0.0005506386 2.750729e-04  2.0017913  4.542893e-02
## SleepHrsNight:Gender -0.0169358141 6.359509e-03 -2.6630694  7.799202e-03

change percent

# Percent change in each coefficient after removing the flagged observations.
# BUG FIX: match coefficients by name — positional subtraction silently
# misaligns when the two fits list terms in different orders.
shared <- intersect(names(m_full$coefficients), names(rm.m_full$coefficients))
abs((rm.m_full$coefficients[shared] - m_full$coefficients[shared]) /
      m_full$coefficients[shared] * 100)
##          (Intercept)        SleepHrsNight                  Age 
##         2.316131e-02         7.053254e-01         1.657323e+02 
##               Gender       factor(Race1)2       factor(Race1)3 
##         3.618700e+03         1.431516e+02         4.612213e+01 
##       factor(Race1)4       factor(Race1)5              Poverty 
##         7.506354e+01         1.183016e+02         1.031831e+02 
##              TotChol             BPDiaAve             BPSysAve 
##         1.801347e+01         3.966602e+01         2.191505e+01 
##          AlcoholYear             Smoke100           UrineFlow1 
##         1.173499e+02         1.095149e+04         9.034970e+01 
##      DaysMentHlthBad      DaysPhysHlthBad   factor(HealthGen)2 
##         5.736501e+01         3.225060e+00         7.342166e-01 
##   factor(HealthGen)3   factor(HealthGen)4   factor(HealthGen)5 
##         7.127646e-01         4.980201e-01         3.623512e-01 
##           PhysActive    SleepHrsNight:Age SleepHrsNight:Gender 
##         3.196291e+00         4.817988e-02         1.476988e+00

The estimated regression coefficients do not change only slightly after removing these observations: 5 of the estimates changed by more than 10%. The p-value for the SleepHrsNight coefficient remains non-significant at the 95% confidence level.

multicollinearity ######################

Pearson correlations

# Variables entering the interaction model (response BMI listed first).
var4 <- c(
  "BMI",
  "SleepHrsNight",
  "Age",
  "Gender",
  "Race1",
  "TotChol",
  "BPDiaAve",
  "BPSysAve",
  "AlcoholYear",
  "Smoke100",
  "DaysPhysHlthBad",
  "PhysActive",
  "Poverty",
  "UrineFlow1",
  "DaysMentHlthBad",
  "HealthGen"
)
newData4 <- df3[, var4]
library("corrplot")
par(mfrow = c(1, 2))
# BUG FIX: method = "pearson" was previously passed to as.matrix() instead
# of cor() because of a misplaced parenthesis; pass it to cor() directly.
cormat4 <- cor(as.matrix(newData4[, -c(1)]), method = "pearson")
# Pair-wise significance tests for the correlation matrix.
p.mat4 <- cor.mtest(as.matrix(newData4[, -c(1)]))$p
# Upper-triangle heat map; cells with p >= 0.05 are left blank.
corrplot(
  cormat4,
  method = "color",
  type = "upper",
  number.cex = 1,
  diag = FALSE,
  addCoef.col = "black",
  tl.col = "black",
  tl.srt = 90,
  p.mat = p.mat4,
  sig.level = 0.05,
  insig = "blank"  # trailing comma removed (was an empty trailing argument)
)

None of the covariates seem strongly correlated. There is no evidence of collinearity from the pair-wise correlations.

collinearity diagnostics (VIF)

# GVIFs for the interaction model. NOTE(review): the large values for
# SleepHrsNight, Age, Gender and the interaction terms reflect the
# structural collinearity of interactions with their main effects (see
# the printed note from car::vif); centering would reduce them.
car::vif(m_full)
## there are higher-order terms (interactions) in this model
## consider setting type = 'predictor'; see ?vif
##                           GVIF Df GVIF^(1/(2*Df))
## SleepHrsNight        13.534700  1        3.678954
## DaysMentHlthBad       1.145332  1        1.070202
## Age                  28.092544  1        5.300240
## Gender               28.493511  1        5.337931
## factor(Race1)         1.248932  4        1.028176
## Poverty               1.325671  1        1.151378
## TotChol               1.132169  1        1.064034
## BPDiaAve              1.445314  1        1.202212
## BPSysAve              1.562733  1        1.250093
## AlcoholYear           1.123289  1        1.059853
## Smoke100              1.141471  1        1.068397
## UrineFlow1            1.046493  1        1.022982
## DaysPhysHlthBad       1.248178  1        1.117219
## factor(HealthGen)     1.465793  4        1.048960
## PhysActive            1.173484  1        1.083275
## SleepHrsNight:Age    37.290593  1        6.106602
## SleepHrsNight:Gender 30.038031  1        5.480696

From the VIF values in the output above, the GVIFs for SleepHrsNight, Age, Gender, and the two interaction terms are large (well above 10). This is the structural collinearity expected when interaction terms are included alongside their main effects (see the note printed by car::vif); centering the variables involved would reduce it. The remaining covariates show no collinearity concerns.

# Return the most frequent value in v; ties go to the value that appears
# first among the unique values (same behavior as the original).
getMode <- function(v) {
  vals <- unique(v)
  freq <- tabulate(match(v, vals))
  vals[which.max(freq)]
}
# Prediction grid: sweep SleepHrsNight over its observed range at the Age
# quartiles, holding every other covariate fixed at its median (or mode).
# NOTE(review): assumes these df3 columns are numeric so median() is
# defined — confirm the coding of Gender/Race1/Smoke100 in df3.
new_data <- expand.grid(SleepHrsNight = seq(min(df3$SleepHrsNight), max(df3$SleepHrsNight), length.out = 100),
                        Age = quantile(df3$Age, probs = c(0.25, 0.5, 0.75)),
                        Gender = median(df3$Gender, na.rm = TRUE),
                        Race1 = median(df3$Race1, na.rm = TRUE),
                        Poverty = median(df3$Poverty, na.rm = TRUE),
                        TotChol = median(df3$TotChol, na.rm = TRUE),
                        BPDiaAve = median(df3$BPDiaAve, na.rm = TRUE),
                        BPSysAve = median(df3$BPSysAve, na.rm = TRUE),
                        AlcoholYear = median(df3$AlcoholYear, na.rm = TRUE),
                        Smoke100 = getMode(df3$Smoke100),
                        UrineFlow1 = median(df3$UrineFlow1, na.rm = TRUE),
                        DaysMentHlthBad = median(df3$DaysMentHlthBad, na.rm = TRUE),
                        DaysPhysHlthBad = median(df3$DaysPhysHlthBad, na.rm = TRUE),
                        HealthGen = getMode(df3$HealthGen),
                        PhysActive = getMode(df3$PhysActive)
)

predict

# Predictions from the interaction model over the grid.
# NOTE(review): despite the column name, m_full predicts log(BMI + 1),
# so these values are on the log scale, not raw BMI.
new_data$predicted_BMI <- predict(m_full, newdata = new_data)

interaction

# Interaction plot: predicted (log-scale) BMI vs sleep hours, one line per
# Age quartile.
library(ggplot2)
ggplot(new_data, aes(x = SleepHrsNight, y = predicted_BMI, group = factor(Age))) +
  geom_line(aes(color = factor(Age))) +
  labs(title = "Interaction between Sleep Hours and Age on BMI",
       x = "Sleep Hours per Night",
       y = "Predicted BMI")

# Interaction plot: predicted (log-scale) BMI vs sleep hours, by Gender.
library(ggplot2)
ggplot(new_data, aes(x = SleepHrsNight, y = predicted_BMI, group = factor(Gender))) +
  geom_line(aes(color = factor(Gender))) +
  labs(title = "Interaction between Sleep Hours and Gender on BMI",
       x = "Sleep Hours per Night",
       y = "Predicted BMI")

cross validation

library(caret)
## Loading required package: lattice
# Hold-out evaluation of the interaction model.
# BUG FIX: m_full's outcome is logBMI, so predictions must be compared
# against logBMI — the original computed mean((SleepHrsNight - yhat)^2),
# which mixes two different variables and makes the MSE meaningless.
# The partition is likewise stratified on the outcome.
splitIndex <-
  createDataPartition(df3$logBMI, p = 0.7, list = FALSE)
trainData <- df3[splitIndex, ]
testData <- df3[-splitIndex, ]
# NOTE(review): m_full was fit on all of df3, so this hold-out MSE is
# optimistic; re-fit the model on trainData for an honest estimate.
predictions <- predict(m_full, newdata = testData)
mse <- mean((testData$logBMI - predictions) ^ 2)
control <-
  trainControl(method = "cv", number = 10)  # 10-fold cross-validation
# BUG FIX: cross-validate a linear model for the same outcome as m_full
# (logBMI, not SleepHrsNight). Exclude raw BMI from the predictors to
# avoid outcome leakage (logBMI is a deterministic function of BMI).
cv_model <-
  train(
    logBMI ~ . - BMI,
    data = df3,
    method = "lm",
    trControl = control
  )
cv_model
## Linear Regression 
## 
## 2247 samples
##   16 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 2022, 2022, 2021, 2022, 2023, 2022, ... 
## Resampling results:
## 
##   RMSE      Rsquared    MAE      
##   1.276784  0.05185099  0.9954254
## 
## Tuning parameter 'intercept' was held constant at a value of TRUE
# Resampling metrics (RMSE, R-squared, MAE) from the 10-fold CV;
# outer parentheses print the assigned value.
(cv_results <- cv_model$results)